/* Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */

#pragma once

#include "top.hpp"
#include "device/device.hpp"
#include "device/devhcmessages.hpp"
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <type_traits>

#if defined(__clang__)
#if __has_feature(address_sanitizer)
#include "device/devurilocator.hpp"
#endif
#endif

namespace amd {

/** \file Support for invoking host services from the device.
 *
 *  A hostcall is a fixed-size request generated by a kernel running
 *  on the device, for some predefined service provided by the
 *  host. The life-cycle of a hostcall is as follows:
 *
 *  1. A workitem in some kernel dispatch submits a request as a
 *     "packet" in a "hostcall buffer". The workitem blocks until it
 *     receives a response from the host.
 *
 *  2. A host thread called the "hostcall listener" notices the packet
 *     and invokes the desired service on the host.
 *
 *  3. When the service completes, the listener copies the response
 *     into the request packet. This unblocks the workitem, and the
 *     hostcall is said to be completed.
 *
 *  The hostcall listeners and buffers are managed by the VDI
 *  runtime. The typical flow is as follows:
 *
 *  - Create and launch one or more hostcall listeners.
 *
 *  - Create and initialize a distinct hostcall buffer for each
 *    command queue in hardware (e.g., an hsa_queue_t on ROCm).
 *
 *  - Register this buffer with the appropriate listener.
 *
 *  - When a buffer is no longer used, deregister and then free
 *    it. This usually happens when the corresponding hardware queue
 *    is freed.
 *
 *  - Destroy the listener(s) when they are no longer required. This must be
 *    done before exiting the application, so that the listener
 *    threads can join() correctly.
 *
 *  A single listener is sufficient to correctly handle all hostcall
 *  buffers created in the application. The client may also launch
 *  multiple listeners, as long as the same hostcall buffer is not
 *  registered with multiple listeners.
 */

/** \brief Determine the buffer size to be allocated for a hostcall buffer.
 *  \param num_packets Number of packets to be supported.
 *  \return Required size, including any internal padding required for
 *          the packets and their headers.
 *  \note NOTE(review): the unit is presumably bytes; the definition is not
 *        part of this header -- confirm.
 */
size_t getHostcallBufferSize(uint32_t num_packets);

/** \brief Query the alignment that a hostcall buffer allocation must satisfy.
 *  \return Required alignment of the buffer.
 *  \note NOTE(review): the unit is presumably bytes; the definition is not
 *        part of this header -- confirm.
 */
uint32_t getHostcallBufferAlignment();

/** \brief Enable hostcall support for the given buffer.
 *  \param dev        Device the buffer is used with.
 *  \param buffer     Memory to be used as the hostcall buffer.
 *  \param numPackets Number of packets the buffer should support.
 *  \return NOTE(review): presumably \c true on success -- confirm against
 *          the definition, which is not part of this header.
 *
 *  NOTE(review): going by the life-cycle described at the top of this file,
 *  this presumably initializes the buffer and registers it with a hostcall
 *  listener; \p buffer is presumably expected to satisfy
 *  getHostcallBufferSize() and getHostcallBufferAlignment() -- confirm.
 */
bool enableHostcalls(const amd::Device& dev, void* buffer, uint32_t numPackets);

/** \brief Disable hostcall support for the given buffer.
 *  \param buffer A hostcall buffer; NOTE(review): presumably one that was
 *                previously passed to enableHostcalls() -- confirm.
 *
 *  NOTE(review): presumably deregisters the buffer from its listener so
 *  that it can then be freed -- confirm against the definition.
 */
void disableHostcalls(void* buffer);

/** Values used with the signal of a hostcall buffer.
 *
 *  NOTE(review): the code that reads and writes these values is not in this
 *  header; presumably they are the values of the buffer's doorbell signal --
 *  confirm against the listener implementation.
 */
enum SignalValue {
  SIGNAL_DONE = 0,
  SIGNAL_INIT = 1,
};

/** \brief Packet payload
 *
 *  Contains 64 slots of 8 ulongs (64-bit words) each, one slot for each
 *  workitem in the wave. A slot with index \c i contains valid data if
 *  bit \c i is set in PacketHeader::activemask_.
 */
struct Payload {
  /** Indexed as slots[workitem][word]. */
  uint64_t slots[64][8];
};

/** \brief Packet header
 *
 *  Describes one hostcall packet: its link in an intrusive stack, which
 *  payload slots are populated, the requested service, and control bits.
 */
struct PacketHeader {
  /** Tagged pointer to the next packet in an intrusive stack */
  uint64_t next_;
  /** Bitmask that represents payload slots with valid data; see Payload */
  uint64_t activemask_;
  /** Service ID requested by the wave */
  uint32_t service_;
  /** Control bits.
   *  \li 0: \c READY flag. Indicates packet awaiting a host response.
   *
   *  Field positions and widths are given by ControlOffset and ControlWidth.
   */
  std::atomic<uint32_t> control_;
};

/* Compile-time guard: the packet header must remain a standard-layout type,
 * since (per the message below) it is meant to be usable from other
 * languages as well. */
static_assert(std::is_standard_layout<PacketHeader>::value,
              "the hostcall packet must be useable from other languages");

/** Field offsets in the packet control field (PacketHeader::control_).
 *
 *  Each value is the position of the lowest bit of the field.
 */
enum ControlOffset {
  /** The \c READY flag starts at bit 0. */
  CONTROL_OFFSET_READY_FLAG = 0,
  /** The reserved field starts at bit 1. */
  CONTROL_OFFSET_RESERVED0 = 1,
};

/** Field widths in the packet control field (PacketHeader::control_).
 *
 *  Each value is the number of bits occupied by the field; together the
 *  two fields cover all 32 bits of the control word.
 */
enum ControlWidth {
  /** The \c READY flag is a single bit. */
  CONTROL_WIDTH_READY_FLAG = 1,
  /** The reserved field takes the remaining 31 bits. */
  CONTROL_WIDTH_RESERVED0 = 31,
};

/** \brief Shared buffer for submitting hostcall requests.
 *
 *  All kernels executing on the same device queue place their hostcall
 *  packets in one such buffer, and a buffer is associated with at most one
 *  device queue.
 *
 *  Packets are referred to by 64-bit tagged pointers, which mitigates the
 *  ABA problem in lock-free stacks. Applying index_mask_ to a tagged pointer
 *  extracts its lower bits, which form the index into the packet arrays; the
 *  remaining higher bits are a tag that is incremented on every pop from a
 *  stack.
 *
 *  The order and access level of the data members is significant: the class
 *  has to stay a standard-layout type.
 */
class HostcallBuffer {
 private:
  /** Array of packet headers. */
  PacketHeader* headers_;
  /** Array of packet payloads. */
  Payload* payloads_;
  /** Signal used by kernels to indicate new work. */
  void* doorbell_;
  /** Stack of free packets, linked through tagged pointers. */
  uint64_t free_stack_;
  /** Stack of ready packets, linked through tagged pointers. */
  std::atomic<uint64_t> ready_stack_;
  /** Mask that extracts the packet index from a tagged pointer. */
  uint64_t index_mask_;
  /** Device handle; some services need a device. */
  const amd::Device* device_;

  /* The checks are nested on purpose, so that __has_feature is only ever
   * evaluated by clang. */
#if defined(__clang__)
#if __has_feature(address_sanitizer)
  /** URI locator; only present in AddressSanitizer builds. */
  device::UriLocator* uri_locator;
#endif
#endif

  /** Header lookup for a tagged pointer.
   *  NOTE(review): presumably indexes headers_ with the masked pointer;
   *  the definition is not part of this header -- confirm.
   */
  PacketHeader* getHeader(uint64_t ptr) const;

  /** Payload lookup for a tagged pointer.
   *  NOTE(review): presumably indexes payloads_ with the masked pointer;
   *  the definition is not part of this header -- confirm.
   */
  Payload* getPayload(uint64_t ptr) const;

 public:
  /** Process packets in this buffer.
   *  \param messages Message handler made available to the processing code.
   *  NOTE(review): presumably services every packet on the ready stack;
   *  the definition is not part of this header -- confirm.
   */
  void processPackets(MessageHandler& messages);

  /** Set up the buffer for the given number of packets.
   *  \param num_packets Number of packets the buffer is to support.
   */
  void initialize(uint32_t num_packets);

  /** Store the signal that kernels use to indicate new work. */
  void setDoorbell(void* doorbell) { doorbell_ = doorbell; }

  /** Store the device to be used by services that need one. */
  void setDevice(const amd::Device* dptr) { device_ = dptr; }

#if defined(__clang__)
#if __has_feature(address_sanitizer)
  /** Store the URI locator (AddressSanitizer builds only). */
  void setUriLocator(device::UriLocator* uri_l) { uri_locator = uri_l; }
#endif
#endif
};

/* Compile-time guard: the buffer must remain a standard-layout type, since
 * (per the message below) it is meant to be usable from other languages as
 * well. Among other things this requires all of its non-static data members
 * to have the same access control. */
static_assert(std::is_standard_layout<HostcallBuffer>::value,
              "the hostcall buffer must be useable from other languages");

}// namespace amd